package com.huobiinfo.demo.webmagic;

import com.huobiinfo.demo.mapper.NewsMapper;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

/**
 * WebMagic {@link PageProcessor} that crawls the NBA news section of voice.hupu.com.
 *
 * <p>Two kinds of pages are handled:
 * <ul>
 *   <li><b>Article pages</b> (URL such as {@code https://voice.hupu.com/nba/2443026.html}):
 *       the headline is stored in the result field {@code "Title"} and the article
 *       paragraphs in the result field {@code "Content"}.</li>
 *   <li><b>Any other page</b> is treated as a listing page: article links and the
 *       paging link found on it are queued as further requests.</li>
 * </ul>
 *
 * <p>Project: huobiinfo_webmagic_springboot. Created: 19-6-14, 2:11 PM.
 *
 * @author peter
 */
public class HupuNewsSpider implements PageProcessor {

    /**
     * Pattern identifying an article URL: a seven-digit id followed by {@code .html}.
     * The dots are escaped so they only match a literal '.'.
     */
    private static final String ARTICLE_URL_REGEX = "https://voice\\.hupu\\.com/nba/[0-9]{7}\\.html";

    /** XPath of the main container shared by every selector below. */
    private static final String MAIN_XPATH =
            "/html/body/div[@class='hp-wrap']/div[@class='voice-main']";

    /** XPath of the article body container ("artical" is the spelling used in the selectors). */
    private static final String ARTICLE_CONTENT_XPATH =
            MAIN_XPATH + "/div[@class='artical-content']";

    /** XPath of the article headline text. */
    private static final String TITLE_XPATH =
            MAIN_XPATH + "/div[@class='artical-title']/h1[@class='headline']/text()";

    /** XPath of the paragraph elements of a video article. */
    private static final String VIDEO_CONTENT_XPATH =
            ARTICLE_CONTENT_XPATH + "/div[@class='artical-content-video']/div[@class='artical-main-content']/p";

    /** XPath of the paragraph texts of a plain text article. */
    private static final String TEXT_CONTENT_XPATH =
            ARTICLE_CONTENT_XPATH + "/div[@class='artical-content-read']/div[@class='artical-main-content']/p/text()";

    /** XPath of the article links on a listing page. */
    private static final String ARTICLE_LINKS_XPATH =
            MAIN_XPATH + "/div[@class='news-list']/ul/li/div[@class='list-hd']/h4/a/@href";

    /** XPath of the paging link on a listing page. */
    private static final String PAGING_LINK_XPATH =
            MAIN_XPATH + "/div[@class='voice-paging']/a[@class='page-btn-prev']/@href";

    /**
     * Crawl configuration: 3 retries and a sleep time of 100
     * (NOTE(review): presumably milliseconds between requests - confirm against the WebMagic Site API).
     */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * Dispatches a downloaded page to the article or the listing handler,
     * depending on whether its URL matches {@link #ARTICLE_URL_REGEX}.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().regex(ARTICLE_URL_REGEX).match()) {
            processArticle(page);
        } else {
            processListing(page);
        }
    }

    /**
     * Extracts the headline and body of an article page into the result fields
     * {@code "Title"} and {@code "Content"}.
     *
     * <p>The video-article layout is tried first; when it yields nothing, the
     * plain text-article layout is used instead. Selector failures are not caught
     * here, so they reach the caller with their original cause.
     *
     * @param page an article page
     */
    private void processArticle(Page page) {
        page.putField("Title", page.getHtml().xpath(TITLE_XPATH).toString());

        // Video article layout first.
        Selectable content = page.getHtml().xpath(VIDEO_CONTENT_XPATH);
        if (StringUtils.isEmpty(content.toString())) {
            // Nothing found: fall back to the text article layout.
            content = page.getHtml().xpath(TEXT_CONTENT_XPATH);
        }

        // Stored as the string form of the list of all matches.
        page.putField("Content", content.all().toString());
    }

    /**
     * Queues every article link and the paging link of a listing page as new requests.
     *
     * @param page a listing page
     */
    private void processListing(Page page) {
        // Article URLs.
        page.addTargetRequests(page.getHtml().xpath(ARTICLE_LINKS_XPATH).all());
        // Paging URL.
        page.addTargetRequests(page.getHtml().xpath(PAGING_LINK_XPATH).all());
    }

    /**
     * Returns the crawl configuration used by this processor.
     *
     * @return the site configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a crawl at the first NBA listing page using 3 threads and sends
     * the extracted fields to a {@code MySqlPipeline}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider.create(new HupuNewsSpider())
                .addUrl("https://voice.hupu.com/nba/1")
                .addPipeline(new MySqlPipeline())
                .thread(3)
                .run();
    }

}
